{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# UNSW-NB15 Random Forest Anomaly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python ≥3.5 is required\n",
    "import sys\n",
    "assert sys.version_info >= (3, 5)\n",
    "\n",
    "# Scikit-Learn ≥0.20 is required\n",
    "import sklearn\n",
    "assert sklearn.__version__ >= \"0.20\"\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_extraction import FeatureHasher\n",
    "\n",
    "# Common imports\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# to make this notebook's output stable across runs\n",
    "np.random.seed(42)\n",
    "\n",
    "# To plot pretty figures\n",
    "%matplotlib inline\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "mpl.rc('axes', labelsize=14)\n",
    "mpl.rc('xtick', labelsize=12)\n",
    "mpl.rc('ytick', labelsize=12)\n",
    "\n",
    "# Where to save the figures\n",
    "PROJECT_ROOT_DIR = \".\"\n",
    "CHAPTER_ID = \"ensembles\"\n",
    "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
    "os.makedirs(IMAGES_PATH, exist_ok=True)\n",
    "\n",
    "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
    "    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
    "    print(\"Saving figure\", fig_id)\n",
    "    if tight_layout:\n",
    "        plt.tight_layout()\n",
    "    plt.savefig(path, format=fig_extension, dpi=resolution)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## File System"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR =  \"./datasets/unsw/\"\n",
    "testing_fname = \"UNSW_NB15_testing-set.csv\"\n",
    "training_fname = \"UNSW_NB15_training-set.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading Files and Column Labeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.read_csv(DATA_DIR + training_fname)\n",
    "df2 =  pd.read_csv(DATA_DIR  + testing_fname)\n",
    "df = df1.append(df2)\n",
    "\n",
    "\n",
    "label_column = ['label']\n",
    "categorical_columns = ['proto', 'service', 'state']\n",
    "drop_columns = ['id', 'sttl', 'dttl', 'swin', 'dwin', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']\n",
    "numeric_columns = list(set(df.columns) - set(label_column) - set(categorical_columns) - set(drop_columns))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "y = df['label'].tolist()\n",
    "X = df\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "pandas.core.frame.DataFrame"
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "<class 'pandas.core.series.Series'>\n"
    },
    {
     "data": {
      "text/plain": "133"
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(type(df['proto']))\n",
    "len(df['proto'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": "/Users/jgraham/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  \n/Users/jgraham/anaconda3/envs/tf2/lib/python3.7/site-packages/pandas/core/indexing.py:494: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  self.obj[item] = s\n/Users/jgraham/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  This is separate from the ipykernel package so we can avoid doing imports until\n/Users/jgraham/anaconda3/envs/tf2/lib/python3.7/site-packages/pandas/core/indexing.py:494: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n  self.obj[item] = s\n"
    }
   ],
   "source": [
    "scaler = sklearn.preprocessing.MinMaxScaler()\n",
    "X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])\n",
    "X_test[numeric_columns] = scaler.fit_transform(X_test[numeric_columns])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Categorical Columns Hash Trick"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html\n",
    "def column_hash_trick(categorical_column, n_hash_features):\n",
    "    \"\"\"Turns categorical columns into hashed with hash trick.\n",
    "    \n",
    "    Args:\n",
    "        categorical_column: Pandas Series.\n",
    "        n_hash_features: Integer for number of hash features desired.\n",
    "        \n",
    "    Returns:\n",
    "        scipy.sparse.csr.csr_matrix of hash features.\n",
    "    \n",
    "    \"\"\"\n",
    "    categories = categorical_column.unique()\n",
    "    h = FeatureHasher(n_features=n_hash_features, input_type='string')\n",
    "    return h.transform(categories)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add to dataframe\n",
    "#pd.concat([df[['Genre', 'Publisher']], pd.DataFrame(hashed_features1),pd.DataFrame(hashed_features2)],\n",
    "#axis=1)\n",
    "from sklearn.compose import ColumnTransformer\n",
    "ct = ColumnTransformer([('hash_proto', FeatureHasher(5, input_type='string'), 'proto'),\n",
    "                      ('hash_service', FeatureHasher(5, input_type='string'), 'service'),\n",
    "                      ('hash_state', FeatureHasher(5, input_type='string'), 'state'),\n",
    "                      ('numeric_cols', 'passthrough', numeric_columns),\n",
    "                       ('dropped', 'drop', drop_columns),\n",
    "                       ('label_drop', 'drop', 'label')])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = ct.fit_transform(X_train)\n",
    "X_test = ct.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'result' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-51-4a99d8abe300>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'result' is not defined"
     ]
    }
   ],
   "source": [
    "result.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rnd_clf = RandomForestClassifier(n_estimators=800, max_leaf_nodes=16, random_state=42)\n",
    "rnd_clf.fit(X_train, y_train)\n",
    "\n",
    "y_pred_rf = rnd_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "array([1, 1, 0, ..., 1, 1, 0])"
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred_rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "0.8969248203169872"
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(y_test == y_pred_rf) / len(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "0.8969248203169872"
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rnd_clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}